Simulate population

# Simulate a large population with three standard-normal covariates.
n_pop <- 1e6

# Design matrix: intercept column plus three covariates.
# replicate() evaluates rnorm(n_pop) three times in order, so the RNG
# stream is consumed exactly as three separate rnorm() calls would.
X <- cbind(1, replicate(3, rnorm(n_pop)))

# True coefficients on the log-odds scale.
beta <- c(-1.5, log(1.5), log(0.1), log(3))

# True event probabilities and simulated binary outcomes.
p <- plogis(X %*% beta)
y <- rbinom(n = n_pop, size = 1, prob = p)

# Population data frame: outcome, covariates (named X1-X3), true probability.
df <- data.frame(y, X[, -1], p)
head(df)
##   y          X1         X2         X3          p
## 1 1 -0.67141163 -1.9508952 -1.1074067 0.81805985
## 2 0 -0.63731704  0.1283204  0.1669023 0.13348185
## 3 0  0.27256001  0.4036060 -0.3696718 0.06151709
## 4 0  1.44706901  0.8588238 -0.4340875 0.03332119
## 5 1 -0.03250427 -1.4420543  0.7990098 0.93614021
## 6 1  0.27473031 -0.1656333  0.4881297 0.38438989

Define “fitted” models

Two models: one perfectly specified and one deliberately miscalibrated (its coefficients are a distortion of the true ones).

M1: perfect model

# M1: the "perfect" model -- scores a data frame with the TRUE coefficients
# `beta` (defined above). Selects the covariate columns (names starting
# with "X"), prepends an intercept, and returns P(Y = 1) as a plain
# numeric vector.
# Uses base grepl() instead of stringr::str_detect(): the "^X" pattern
# matches the same columns, and it drops the stringr dependency.
m1 <- function(x) {
  covar_cols <- grepl("^X", colnames(x))
  as.vector(plogis(as.matrix(cbind(1, x[, covar_cols])) %*% beta))
}

M2: not so great model

# M2: a deliberately "not so great" model -- same functional form as M1 but
# with distorted coefficients `beta2`, so its predictions are
# systematically miscalibrated.
beta2 <- c(-2.8, log(2), log(0.005), log(5))

# Same scoring logic as m1; base grepl() replaces stringr::str_detect()
# (identical "^X" match, one fewer dependency).
m2 <- function(x) {
  covar_cols <- grepl("^X", colnames(x))
  as.vector(plogis(as.matrix(cbind(1, x[, covar_cols])) %*% beta2))
}

calibration curves

# Draw a validation sample and attach each model's predictions plus their
# linear predictors (logit scale).
# sample(n_pop, ...) is equivalent to sample(1:n_pop, ...) and avoids
# materializing the 1:n_pop vector.
ix <- sample(n_pop, 1e5)
.df <- df[ix, ]

# logit: inverse of plogis, maps probabilities to the linear-predictor scale.
logit <- \(x) log(x / (1 - x))

.df$p_hat1 <- m1(.df)
.df$lp1 <- logit(.df$p_hat1)
.df$p_hat2 <- m2(.df)
.df$lp2 <- logit(.df$p_hat2)

# Flexible calibration curves: regress the observed outcome on a restricted
# cubic spline (5 knots) of each model's linear predictor.
cal1 <- glm(y ~ rms::rcs(lp1, 5), data = .df, family = binomial)
cal2 <- glm(y ~ rms::rcs(lp2, 5), data = .df, family = binomial)

# BUG FIX: predict.glm has no `data` argument -- in the original it was
# silently swallowed by `...`, so predict() returned in-sample fitted values.
# That happened to be the intended result here only because the models were
# fit on .df itself; `newdata =` makes the intent explicit and correct for
# any data set.
.df$cal1_fitted <- predict(cal1, newdata = .df, type = "response")
.df$cal2_fitted <- predict(cal2, newdata = .df, type = "response")

Treatment errors for each threshold

For each threshold \(t\), a treatment error occurs when \(\hat{p}\) is on the wrong side of \(t\), i.e.

\[ \begin{align} &p < t \text{ | } \hat{p} > t \implies \text{overtreatment} \\ &p > t \text{ | } \hat{p} < t \implies \text{undertreatment} \end{align} \]

We will compute the probability of each event for models M1 and M2, using both the true underlying probability \(p\) and the calibration estimate \(\tilde{p} = \Pr(Y=1 | \hat{p} = x) \ , \text{ for } x \in (0,1)\).

library(tidyverse)
# Decision thresholds at which treatment errors are evaluated.
thresholds <- seq(0.02, 0.5, 0.02)

# For a single threshold, tabulate over-/under-treatment probabilities for
# both models, judged against (a) the true underlying p and (b) the
# calibration estimate. Returns one tibble with four rows.
get_treatment_errors <- function(thr, df_val) {
  # P(reference > thr) among cases predicted BELOW thr: undertreatment.
  # (NaN when no prediction falls below thr; dropped later via na.omit.)
  under <- function(ref, pred) mean(ref[pred < thr] > thr)
  # P(reference < thr) among cases predicted ABOVE thr: overtreatment.
  over <- function(ref, pred) mean(ref[pred > thr] < thr)

  tibble(
    type = rep(c("P(p > t | p_hat < t)", "P(p < t | p_hat > t)"), 2),
    ref_type = rep(c("underlying p", "calibration"), each = 2),
    type_label = rep(c("undertreatment", "overtreatment"), 2),
    model1 = c(
      under(df_val$p, df_val$p_hat1),
      over(df_val$p, df_val$p_hat1),
      under(df_val$cal1_fitted, df_val$p_hat1),
      over(df_val$cal1_fitted, df_val$p_hat1)
    ),
    model2 = c(
      under(df_val$p, df_val$p_hat2),
      over(df_val$p, df_val$p_hat2),
      under(df_val$cal2_fitted, df_val$p_hat2),
      over(df_val$cal2_fitted, df_val$p_hat2)
    ),
    thr = thr  # scalar recycles to all four rows
  )
}

d <- map_df(thresholds, get_treatment_errors, df_val = .df)

plot

Plot everything to see if calibration \(\tilde{p}\) gives a nice (consistent) estimate of the over-/under-treatment probabilities.

theme_set(theme_minimal())

# One panel per error type; color distinguishes the two models, while shape
# and size distinguish the reference (underlying p vs. calibration).
# na.omit() drops thresholds where a conditioning event never occurred.
d |>
  na.omit() |>
  pivot_longer(cols = contains("model")) |>
  ggplot(aes(thr, value, color = name, shape = ref_type, size = ref_type)) +
  geom_point() +
  facet_wrap(~type) +
  scale_shape_manual(values = c(19, 21)) +
  scale_size_manual(values = c(1, 3)) +
  scale_x_continuous(labels = scales::percent) +
  scale_y_continuous(
    labels = scales::percent,
    breaks = scales::pretty_breaks(10)
  ) +
  labs(x = "Decision threshold", y = NULL)

Apparently

\[ \Pr(p < t \text{ | } \hat{p} > t) \not\approx \Pr(\tilde{p} < t \text{ | } \hat{p} > t) \] If we expand the expression for the different \(p\)’s, the math gets a bit confusing (conditional probability of conditional probabilities).

possible reason

Calibration looks at \(\hat{p}\), which hopefully approximates \(p\), but \(\hat{p}\) could be arbitrarily off (e.g. case-mix variation).

# Scatter of model-2 predictions against the underlying p (black) and the
# calibration estimate (blue), with the identity line for reference.
# FIXES: (1) `size` is deprecated for line geoms since ggplot2 3.4 -- use
# `linewidth` on geom_abline; (2) "Underlyng" typo in the y-axis label.
.df %>% 
  sample_n(2e4) %>% 
  ggplot(aes(p_hat2)) +
  geom_point(aes(y = p), alpha = 0.3) +
  geom_point(aes(y = cal2_fitted), color = "blue") +
  geom_abline(linetype = 2, color = "red", linewidth = 2) +
  labs(x = "Predicted probability (model 2)",
       y = "Underlying p or calibration p")

How can we capture the spread around the calibration curve? Like a “predictive interval for calibration p”.

What if we re-fit the model in the validation data and use its predictions as “observed proportions”? (instead of using the calibration p)

# Draw a fresh test sample and re-score both models.
# sample(n_pop, ...) avoids the 1:n_pop idiom and draws identically.
n_test <- 1e5
.df <- df[sample(n_pop, n_test), ]
.df$p_hat1 <- m1(.df)
.df$lp1 <- logit(.df$p_hat1)
.df$p_hat2 <- m2(.df)
.df$lp2 <- logit(.df$p_hat2)

# Re-fit the (correctly specified) model on the test data itself; its
# fitted values serve as the "observed proportions" reference.
test_fit <- glm(y ~ X1 + X2 + X3, data = .df, family = binomial)

# NOTE: both models are deliberately judged against the SAME test-set refit
# (see the surrounding text), so cal1_fitted == cal2_fitted -- compute the
# predictions once instead of twice.
.df$cal1_fitted <- predict(test_fit, type = "response")
.df$cal2_fitted <- .df$cal1_fitted

# Relabel the reference so plots distinguish this test-set refit from the
# earlier spline-calibration estimate.
d <- map_df(thresholds, get_treatment_errors, df_val = .df) %>% 
  mutate(
    ref_type = ifelse(
      ref_type == "calibration",
      "test fit",
      ref_type
    )
  )

# Same diagnostic plot as before, now with the test-set refit (instead of
# the calibration curve) as the second reference.
d |>
  na.omit() |>
  pivot_longer(cols = contains("model")) |>
  ggplot(aes(thr, value, color = name, shape = ref_type, size = ref_type)) +
  geom_point() +
  facet_wrap(~type) +
  scale_shape_manual(values = c(19, 21)) +
  scale_size_manual(values = c(1, 3)) +
  scale_x_continuous(labels = scales::percent) +
  scale_y_continuous(
    labels = scales::percent,
    breaks = scales::pretty_breaks(10)
  ) +
  labs(x = "Decision threshold", y = NULL)

The data set needed to estimate the over-/under-treatment probabilities must be large enough for us to obtain a second estimate \(\hat{p}_2 = \Pr(Y_{\text{test}} = 1 \mid X_{\text{test}})\), which amounts to training another model on the test set (a bit like model revision).

Predicted versus calibration/underlying p

# Model-2 predictions vs. the underlying p (black) and the test-set refit
# predictions (blue circles), with the identity line for reference.
# FIXES: (1) `size` is deprecated for line geoms since ggplot2 3.4 -- use
# `linewidth` on geom_abline (geom_point correctly keeps `size`);
# (2) "Underlyng" typo in the y-axis label.
.df %>% 
  sample_n(2e4) %>% 
  ggplot(aes(p_hat2)) +
  geom_point(aes(y = p), alpha = 0.8) +
  geom_point(aes(y = cal2_fitted), alpha = 0.5, color = "blue", 
             shape = 21, size = 2) +
  geom_abline(linetype = 2, color = "red", linewidth = 2) +
  labs(x = "Predicted probability (model 2)",
       y = "Underlying p or calibration p")